HappyDB is a corpus of 100,000 crowd-sourced happy moments via Amazon’s Mechanical Turk. You can read more about it on https://arxiv.org/abs/1801.07746.
Here, we explore this data set and try to answer the question, “What makes people happy?”
From the packages’ descriptions:
tidyverse is an opinionated collection of R packages designed for data science. All packages share an underlying design philosophy, grammar, and data structures;tidytext allows text mining using ‘dplyr’, ‘ggplot2’, and other tidy tools;DT provides an R interface to the JavaScript library DataTables;scales map data to aesthetics, and provide methods for automatically determining breaks and labels for axes and legends;wordcloud2 provides an HTML5 interface to wordcloud for data visualization;gridExtra contains miscellaneous functions for “grid” graphics;ngram is for constructing n-grams (“tokenizing”), as well as generating new text based on the n-gram structure of a given text input (“babbling”);Shiny is an R package that makes it easy to build interactive web apps straight from R;library(tidyverse)
library(tidytext)
library(DT)
library(scales)
library(wordcloud2)
library(gridExtra)
library(ngram)
library(shiny)
library(igraph)
library(ggraph)
We use the processed data for our analysis and combine it with the demographic information available.
hm_data <- read_csv("../output/processed_moments.csv")
urlfile<-'https://raw.githubusercontent.com/rit-public/HappyDB/master/happydb/data/demographic.csv'
demo_data <- read_csv(urlfile)
We select a subset of the data that satisfies specific row conditions.
hm_data <- hm_data %>%
inner_join(demo_data, by = "wid") %>%
select(wid,
original_hm,
gender,
marital,
parenthood,
reflection_period,
age,
country,
ground_truth_category,
text) %>%
mutate(count = sapply(hm_data$text, wordcount)) %>%
filter(gender %in% c("m", "f")) %>%
filter(marital %in% c("single", "married")) %>%
filter(parenthood %in% c("n", "y")) %>%
filter(reflection_period %in% c("24h", "3m")) %>%
mutate(reflection_period = fct_recode(reflection_period,
months_3 = "3m", hours_24 = "24h"))
bag_of_words <- hm_data %>%
unnest_tokens(word, text)
word_count <- bag_of_words %>%
count(word, sort = TRUE)
hm_bigrams <- hm_data %>%
filter(count != 1) %>%
unnest_tokens(bigram, text, token = "ngrams", n = 2)
bigram_counts <- hm_bigrams %>%
separate(bigram, c("word1", "word2"), sep = " ") %>%
count(word1, word2, sort = TRUE)
data <- hm_data[which(hm_data$ground_truth_category != "NA"),]
ggplot(data,aes(x = marital, fill = data$ground_truth_category)) + geom_bar(position = "fill")
ggplot(data,aes(x= marital, y = ground_truth_category)) + geom_jitter(alpha = 0.1)
ggplot(data,aes(x= reflection_period, y = ground_truth_category, color = marital))+ geom_jitter(alpha = 0.4)
ggplot(data,aes(x= country, y = ground_truth_category, color = marital))+ geom_jitter(alpha = 0.4) + theme(axis.text.x = element_text(angle = 90, hjust = 1))
ggplot(data,aes(x= gender, y = ground_truth_category, color = marital))+ geom_jitter(alpha = 0.4)
ggplot(data,aes(x= parenthood, y = ground_truth_category, color = marital))+ geom_jitter(alpha = 0.4)
We want each tab to have its own controls for input and so Shiny’s “navbarPage()” layout works the best. We have the first tab visualizing the overall data, second one for scatterplots comparing the proportion of words within categories, third one highlighting the most frequently appearing bigrams based on categories and the last tab to explore the actual happy moments.
ui <- navbarPage("What makes people happy?",
tabPanel("Overview",
titlePanel(h1("Most Frequent Occurrences",
align = "center")),
sidebarLayout(
sidebarPanel(
sliderInput(inputId = "topWordcloud",
label = "Number of terms for word cloud:",
min = 5,
max = 100,
value = 50),
br(),
br(),
checkboxInput(inputId = "topFreqB",
label = "Plot Bar Chart",
value = F),
sliderInput(inputId = "topBarchart",
label = "Number of terms for bar chart:",
min = 1,
max = 25,
value = 10),
br(),
br(),
checkboxInput(inputId = "topFreqN",
label = "Plot Network Graph",
value = F),
sliderInput(inputId = "topNetwork",
label = "Number of edges for network graph:",
min = 1,
max = 150,
value = 50)
),
mainPanel(
wordcloud2Output(outputId = "WC"),
plotOutput(outputId = "figure")
)
)
),
tabPanel("Individual Terms",
titlePanel(h1("Comparison of Proportions",
align = "center")),
sidebarLayout(
sidebarPanel(
selectInput(inputId = "attribute",
label = "Select the attribute:",
choices = c("Gender" = "gender",
"Marital Status" = "marital",
"Parenthood" = "parenthood",
"Reflection Period" = "reflection_period")
)
),
mainPanel(
plotOutput(outputId = "scatter")
)
)
),
tabPanel("Pair of Words",
titlePanel(h1("Most Frequent Bigrams",
align = "center")),
sidebarLayout(
sidebarPanel(
selectInput(inputId = "factor",
label = "Select the attribute:",
choices = c("Gender" = "gender",
"Marital Status" = "marital",
"Parenthood" = "parenthood",
"Reflection Period" = "reflection_period")
),
numericInput(inputId = "topBigrams",
label = "Number of top pairs to view:",
min = 1,
max = 25,
value = 10)
),
mainPanel(
plotOutput(outputId = "bar")
)
)
),
tabPanel("Data",
DT::dataTableOutput("table")
)
)
This shiny app visualizes summary of data and displays the data table itself.
server <- function(input, output, session) {
pt1 <- reactive({
if(!input$topFreqB) return(NULL)
word_count %>%
slice(1:input$topBarchart) %>%
mutate(word = reorder(word, n)) %>%
ggplot(aes(word, n)) +
geom_col() +
xlab(NULL) +
ylab("Word Frequency")+
coord_flip()
})
pt2 <- reactive({
if(!input$topFreqN) return(NULL)
bigram_graph <- bigram_counts %>%
slice(1:input$topNetwork) %>%
graph_from_data_frame()
set.seed(123)
x <- grid::arrow(type = "closed", length = unit(.1, "inches"))
ggraph(bigram_graph, layout = "fr") +
geom_edge_link(aes(edge_alpha = n), show.legend = FALSE,
arrow = x, end_cap = circle(.05, 'inches')) +
geom_node_point(color = "skyblue", size = 3) +
geom_node_text(aes(label = name), repel = TRUE) +
theme_void()
})
output$WC <- renderWordcloud2({
word_count %>%
slice(1:input$topWordcloud) %>%
wordcloud2(size = 0.6,
rotateRatio = 0)
})
output$figure <- renderPlot(height = 500, width = 500, {
ptlist <- list(pt1(),pt2())
ptlist <- ptlist[!sapply(ptlist, is.null)]
if(length(ptlist)==0) return(NULL)
lay <- rbind(c(1,1),
c(2,2))
grid.arrange(grobs = ptlist, layout_matrix = lay)
})
selectedAttribute <- reactive({
list(atr = input$attribute)
})
output$scatter <- renderPlot({
temp <- bag_of_words %>%
count(!!as.name(selectedAttribute()$atr), word) %>%
group_by(!!as.name(selectedAttribute()$atr)) %>%
mutate(proportion = n / sum(n)) %>%
select(-n) %>%
spread(!!as.name(selectedAttribute()$atr), proportion)
ggplot(temp,
aes_string(x = colnames(temp)[2], y = colnames(temp)[3]),
color = abs(colnames(temp)[3] - colnames(temp)[2])) +
geom_abline(color = "gray40", lty = 2) +
geom_jitter(alpha = 0.1, size = 1, width = 0.3, height = 0.3) +
geom_text(aes(label = word), check_overlap = TRUE, vjust = 1.5) +
scale_x_log10(labels = percent_format()) +
scale_y_log10(labels = percent_format()) +
scale_color_gradient(limits = c(0, 0.001), low = "darkslategray4", high = "gray75") +
theme(legend.position="none")
})
selectedBigram <- reactive({
list(var = input$factor)
})
output$bar <- renderPlot({
hm_bigrams %>%
count(!!as.name(selectedBigram()$var), bigram, sort = TRUE) %>%
group_by(!!as.name(selectedBigram()$var)) %>%
top_n(input$topBigrams) %>%
ungroup() %>%
mutate(bigram = reorder(bigram, n)) %>%
ggplot(aes(bigram, n, fill = !!as.name(selectedBigram()$var))) +
geom_col(show.legend = FALSE) +
facet_wrap(as.formula(paste("~", selectedBigram()$var)), ncol = 2, scales = "free") +
coord_flip()
})
output$table <- DT::renderDataTable({
DT::datatable(hm_data)
})
}
shinyApp(ui, server)